Setup

First, we need to load in the necessary libraries:

library(arules)
library(arulesViz)
library(dplyr)
library(plotly)
library(data.table)

Then we need to load our dataset, as well as examine it a bit

data("AdultUCI")
dim(AdultUCI)
#> [1] 48842    15
AdultUCI

Next lets get rid of some columns we are not going to use/that has some issues, and then lets break down our numeric variables into factors using cut

data <- AdultUCI[, -c(3, 5, 11:12)]
colnames(data)[colnames(data) == "hours-per-week"] <- "hoursperweek"
data$age <- cut(data$age, breaks = c(15, 25, 45, 65, 100), labels = c("Young", 
    "Middleaged", "Senior", "Retired"))
data$hoursperweek <- cut(data$hoursperweek, breaks = c(0, 20, 40, 60, 80), labels = c("part-time", 
    "full-time", "hard-working", "need-a-life"))
str(data)
#> 'data.frame':    48842 obs. of  11 variables:
#>  $ age           : Factor w/ 4 levels "Young","Middleaged",..: 2 3 2 3 2 2 3 3 2 2 ...
#>  $ workclass     : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
#>  $ education     : Ord.factor w/ 16 levels "Preschool"<"1st-4th"<..: 14 14 9 7 14 15 5 9 15 14 ...
#>  $ marital-status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
#>  $ occupation    : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
#>  $ relationship  : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
#>  $ race          : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
#>  $ sex           : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
#>  $ hoursperweek  : Factor w/ 4 levels "part-time","full-time",..: 2 1 2 2 2 2 1 3 3 2 ...
#>  $ native-country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
#>  $ income        : Ord.factor w/ 2 levels "small"<"large": 1 1 1 1 1 1 1 2 2 2 ...

Next, we convert the data to an object with class transactions, after viewing it again

data
data <- as(data, "transactions")
summary(data)
#> transactions as itemMatrix in sparse format with
#>  48842 rows (elements/itemsets/transactions) and
#>  109 columns (items) and a density of 0.09658517 
#> 
#> most frequent items:
#> native-country=United-States                   race=White 
#>                        43832                        41762 
#>            workclass=Private                     sex=Male 
#>                        33906                        32650 
#>       hoursperweek=full-time                      (Other) 
#>                        30037                       332011 
#> 
#> element (itemset/transaction) length distribution:
#> sizes
#>     7     8     9    10    11 
#>    27   974  2160 15714 29967 
#> 
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    7.00   10.00   11.00   10.53   11.00   11.00 
#> 
#> includes extended item information - examples:
#>           labels variables     levels
#> 1      age=Young       age      Young
#> 2 age=Middleaged       age Middleaged
#> 3     age=Senior       age     Senior
#> 
#> includes extended transaction information - examples:
#>   transactionID
#> 1             1
#> 2             2
#> 3             3

Exploration

Before we begin with our analysis, lets check out the rule frequencies within the dataset. We are looking for rules with support >= .2

itemFrequencyPlot(data, support = 0.2)

Rule mining

Next, lets mine some rules with the apriori algorithm, and then clean up redundant rules. We are still sorting out what to set the minsupp and minconf to.

zerules <- apriori(data, parameter = list(minlen = 2, supp = 0.2, conf = 0.3), 
    appearance = list(rhs = c("income=small", "income=large"), default = "lhs"), 
    control = list(verbose = F))
length(zerules)
#> [1] 23
redundant <- is.redundant(zerules)
zerules.pruned <- zerules[redundant == FALSE]
rulesorted <- sort(zerules.pruned, by = "lift", decreasing = TRUE)
length(rulesorted)
#> [1] 8

Rule quality and inspection

Next, let us inspect the rules, and examine their quality

(quality(rulesorted))
inspectDT(rulesorted)

Plots

First lets view a scatterplot of our rules

plot(rulesorted, method = "scatterplot", measure = c("confidence", "support"), 
    shading = "lift", engine = "htmlwidget")

Next lets look at a balloon plot

plot(rulesorted, method = "graph", measure = "confidence", shading = "lift", 
    engine = "htmlwidget")

Parallel plot

plot(rulesorted, method = "paracoord", measure = "confidence", shading = "lift", 
    control = list(reorder = T))

Two key plot

plot(rulesorted, method = "two-key plot", measure = "confidence", shading = "lift", 
    engine = "htmlwidget")

grouped plot

plot(rulesorted, method = "grouped", measure = "confidence", shading = "lift")

alternate rule mining

rule2 <- apriori(data, parameter = list(supp = 0.01, conf = 0.5), appearance = list(rhs = c("income=small", 
    "income=large"), default = "lhs"), control = list(verbose = F))
length(rule2)
#> [1] 4115
redundant <- is.redundant(rule2)
rulep <- rule2[redundant == FALSE]
rulesorted2 <- sort(rulep, by = "lift", decreasing = TRUE)
length(rulesorted2)
#> [1] 725

Inspection

head(quality(rulesorted2))
inspectDT(rulesorted2)

Plotting

plot(rulesorted2, method = "scatterplot", measure = c("confidence", "support"), 
    shading = "lift", engine = "htmlwidget")
plot(rulesorted2, method = "graph", measure = "confidence", shading = "lift", 
    engine = "htmlwidget")
plot(rulesorted2, method = "two-key plot", measure = "confidence", shading = "lift", 
    engine = "htmlwidget")
plot(rulesorted2, method = "grouped", measure = "confidence", shading = "lift")